/*
Karl Dunkle Werner
May 6, 2018
io_tables_to_stata.do

Purpose: read the BEA's direct requirements IO table and convert to stata

Edited by Hal Gordon
Sept/October 2018

Edited: Mar Reguant, August 2020

*/

clear all
set more off
//-----  Input files -----//
// Domestic total requirements industry-to-industry detail table, after redefinitions
// Downloaded from
// https://apps.bea.gov/industry/xls/io-annual/CxI_Domestic_DR_2007_detail.xlsx
global IO_DR_EXCEL = "$IOraw/IxI_Domestic_2007_detail.xlsx"

//-----  Output files -----//
global IO_DR_STATA = "$IO/IxI_Domestic_2007_detail.dta"
// You could also generate CxI_DR_2007_detail.dta, using CxI_DR_2007_detail.xlsx


program read_io_dr_excel
    // Excel file downloaded from
    // https://apps.bea.gov/industry/xls/io-annual/CxI_Domestic_DR_2007_detail.xlsx
    disp "    reading excel file..."


    if (regexm("$IO_DR_EXCEL", "CxI_DR_2007")) {
        local sheet_name = 2007
        local cellrange = "A5:OA397"
    }
    else if (regexm("$IO_DR_EXCEL", "CxI_Domestic_DR_2007")) {
        local sheet_name = 2007
        local cellrange = "A5:OA394"
    }
	else if (regexm("$IO_DR_EXCEL", "IxI_Domestic_2007")) {
        local sheet_name = 2007
        local cellrange = "A5:OA394"
    }
    else {
        disp as error "This code depends on parameters that are specific to the file being read. Please edit the code."
        error 9999
    }

    import excel using "$IO_DR_EXCEL", ///
        sheet(`sheet_name') cellrange(`cellrange') clear firstrow
    rename Code code
	rename IndustryDescription description
    isid code

    foreach var of varlist * {
        if inlist("`var'", "code", "description") {
            // rename all variables except these two.
            // Can't uses `ds code description, not` because Stata 12 silently
            // truncates local variables.
            continue
        }
        // Get the label, which is where `import excel` put the industry codes
        local vlab: var label `var'
        // Rename with v_<industry code> so reshape can use the "v_"
        rename `var' v`vlab'
    }

    disp "    reshaping long..."
    // Reshape from a matrix to something that's easier to work with in Stata
    quietly reshape long v, i(code description) j(colCode) string
    rename code rowCode
    rename description rowDescription
    // Check that the reshape resulted in the right variables
    quietly ds

    /* Drop these sectors because they're weird and not what we really want.

        S00102	Other federal government enterprises
        S00700	State and local general government
        S00203	Other state and local government enterprises
        S00401	Scrap
        S00402	Used and secondhand goods
        S00300	Noncomparable imports
        S00900	Rest of the world adjustment
        V00100	Compensation of employees
        V00200	Taxes on production and imports, less subsidies
        V00300	Gross operating surplus

    Here are ones that are weird that I'm not (yet) dropping:
        S00101	Federal electric utilities

        S00201	State and local government passenger transit
        S00202	State and local government electric utilities

    */
    quietly drop if inlist(rowCode, "S00102", "S00700", "S00203", "S00401", ///
        "S00402", "S00900", "S00300", "V00100", "V00200") | rowCode == "V00300"
    quietly drop if inlist(colCode, "S00102", "S00700", "S00203", "S00401", ///
        "S00402", "S00900", "S00300", "V00100", "V00200") | colCode == "V00300"

    // Note that if I hadn't dropped the compensation/surplus categories above,
    // coltot would always be 1 and v would be equal to colShare.
    tempvar coltot
    sort colCode
    by colCode: egen `coltot' = total(v)
    quietly replace `coltot' = 1 if `coltot' == 0  // some columns v always 0?
    gen colShare = v / `coltot'

    quietly compress
    isid colCode rowCode
	//collapse (sum) colShare, by(colCode)
	//At this point, the colShares add up to one. That makes sense.
end

program joinby_naics
    // Pull the NAICS crossover Mar has previously created
    rename rowCode IOnaics
    joinby IOnaics using "$crosswalks/IOnaics_to_naics.dta", unmatched(master)
    // Should match all of master data (so no _merge == 1)
    drop _merge Description

    rename IOnaics rowCode
    rename naics rowNaics
    rename colCode IOnaics

    joinby IOnaics using "$crosswalks/IOnaics_to_naics.dta", unmatched(master)
    drop _merge Description
    rename IOnaics colCode
    rename naics colNaics
end


program run_naics_corrections
	//I don't think this is very important since it does not relate to
	//manufacturing
	preserve
    tempfile corrections
    quietly insheet using "$crosswalks/io_naics_corrections.csv", names comma clear
    isid code
    keep code new_naics
    gen rowCode = code
    rename code colCode
    quietly save `corrections'
    restore

    merge m:1 rowCode using "`corrections'", ///
        keepusing(new_naics) keep(master match) nogenerate noreport
    rename rowNaics old_naics
    egen rowNaics = rowfirst(new_naics old_naics)
    drop old_naics new_naics


    // Now do exactly the same for colNaics
    merge m:1 colCode using "`corrections'", ///
        keepusing(new_naics) keep(master match) nogenerate noreport
    rename colNaics old_naics
    egen colNaics = rowfirst(new_naics old_naics)
    drop old_naics new_naics

    // Specifically fix aluminum:
    // The codes in the IO matrix are:
    // 33131A	Alumina refining and primary aluminum production
    // 331314	Secondary smelting and alloying of aluminum
    // 33131B	Aluminum product manufacturing from purchased aluminum
    // These correspond to 2007-NAICS 331311 and 331312; 331314; and 331315,
    // 331316, 331319.
    // Note that 331311 and 331312 merged to become 331313 in the 2012 NAICS.
    // However, the BEA writes:
    //   Primary output of the "secondary smelting and
    //   alloying of aluminum" and "alumina refining and primary aluminum
    //   production" industries is treated as being identical and is reported as
    //   "alumina refining and primary aluminum production" for both industries.
    //   As a result, "secondary smelting and alloying of aluminum" appears as
    //   an industry at the detail level but not as a commodity.
    // So, with the knowledge that I'm eventually trying to match with
    // NAICS 331311, 331312, and 331314, I'm going to assign all of the
    // commodities (rows) to the newer code 331313, and also recode the
    // industries 331311 and 331312 to 331313.

    //quietly replace rowNaics = 331313 if inlist(rowCode, "33131A", "331314")
    //quietly replace colNaics = 331313 if colCode == "33131A"
    if (! regexm("$IO_DR_EXCEL", "CxI")) {
        //disp as error "The code above was written for a commodity-by-" ///
        //    "industry IO table. Please remove 331314 if you have industry-" ///
        //    "by-industry figures."
		//drop if rowCode == "331314"
		//drop if colCode == "331314"
    }

    quietly duplicates drop
    isid colCode rowCode colNaics rowNaics
end

program add_var_labels
    label variable rowCode "Row in direct requirements matrix (commodity input)"
    label variable colCode "Col in direct requirements matrix (industry output)"
    label variable rowDescription "BEA description of rowCode"
    label variable v "\$ of row commodity to create \$1 of col industry output"
    label variable colShare "Share of total v by colCode"
    label variable colNaics "NAICS corresponding to colCode"
    label variable rowNaics "NAICS corresponding to rowCode"
end


read_io_dr_excel
joinby_naics
run_naics_corrections
add_var_labels
save "$IO_DR_STATA", replace
